Chapter 5 Regression Analysis

To forecast future charging-facility locations, the following section compares the distribution of charge-point registrations with the characteristics discussed above — land-use, travel-related and socio-demographic — using ordinary least squares (OLS) regression models. To improve the reliability of the results, two spatial specifications also take part in the comparison: the Spatial Lag Model (SLM), which includes a spatially lagged version of the dependent variable as an additional independent variable, and the Spatial Error Model (SEM), which incorporates a spatial lag of the OLS model's residuals.

5.1 Ordinary Least Squares (OLS) regression model

The models below quantify the specific relationships between the dependent variables and the explanatory variables.

# Model 1: OLS regression of log charge-point use rate on land-use,
# travel-related and socio-demographic characteristics
model1 <- lm(
  log(userate) ~ employment_rate + log(houseprice) +
    log(publictransaccess_score) + log(parking_density) +
    residential_percentage + road_percentage,
  data = independent
)

# full coefficient table, residual summary and fit statistics
summary(model1)
## 
## Call:
## lm(formula = log(userate) ~ employment_rate + log(houseprice) + 
##     log(publictransaccess_score) + log(parking_density) + residential_percentage + 
##     road_percentage, data = independent)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6909 -0.5438  0.0446  0.6646  2.4044 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  21.198355   1.151786  18.405  < 2e-16 ***
## employment_rate              -0.011648   0.006816  -1.709   0.0880 .  
## log(houseprice)              -0.922267   0.105108  -8.774  < 2e-16 ***
## log(publictransaccess_score) -1.096141   0.180330  -6.079 2.12e-09 ***
## log(parking_density)         -0.065196   0.035885  -1.817   0.0697 .  
## residential_percentage        0.025336   0.003101   8.171 1.73e-15 ***
## road_percentage              -0.056456   0.011857  -4.761 2.40e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8738 on 618 degrees of freedom
## Multiple R-squared:  0.5763, Adjusted R-squared:  0.5722 
## F-statistic: 140.1 on 6 and 618 DF,  p-value: < 2.2e-16
# one-row model-fit summary (R-squared, AIC, BIC, log-likelihood) via broom
glance(model1)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC deviance df.residual
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1     0.576         0.572 0.874      140. 9.44e-112     6  -799. 1614. 1649.     472.         618
## # … with 1 more variable: nobs <int>
# tidy coefficient table (term, estimate, std.error, statistic, p.value)
m1_tidy <- model1 %>% 
  tidy() 
m1_tidy
## # A tibble: 7 x 5
##   term                         estimate std.error statistic  p.value
##   <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                   21.2      1.15        18.4  1.21e-60
## 2 employment_rate               -0.0116   0.00682     -1.71 8.80e- 2
## 3 log(houseprice)               -0.922    0.105       -8.77 1.67e-17
## 4 log(publictransaccess_score)  -1.10     0.180       -6.08 2.12e- 9
## 5 log(parking_density)          -0.0652   0.0359      -1.82 6.97e- 2
## 6 residential_percentage         0.0253   0.00310      8.17 1.73e-15
## 7 road_percentage               -0.0565   0.0119      -4.76 2.40e- 6
# Model 2: OLS regression of log charge-point density on the same six
# predictors as model1
# NOTE(review): model2 is fitted on `characters` while model1 and all the
# later spatial models use `independent` — confirm the two data frames
# hold the same wards and columns
model2 <- lm(log(density) ~ 
               employment_rate+
               log(houseprice)+
               log(publictransaccess_score)+
               log(parking_density)+
               residential_percentage+
               road_percentage, 
             data = characters)
# show the coefficient table, residual summary and fit statistics
summary(model2)
## 
## Call:
## lm(formula = log(density) ~ employment_rate + log(houseprice) + 
##     log(publictransaccess_score) + log(parking_density) + residential_percentage + 
##     road_percentage, data = characters)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.7962 -0.6752 -0.0135  0.5784  3.8890 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -14.161502   1.178311 -12.018  < 2e-16 ***
## employment_rate                0.014448   0.006973   2.072  0.03868 *  
## log(houseprice)                1.006913   0.107528   9.364  < 2e-16 ***
## log(publictransaccess_score)   1.101611   0.184483   5.971 3.97e-09 ***
## log(parking_density)           0.102071   0.036711   2.780  0.00559 ** 
## residential_percentage        -0.005205   0.003172  -1.641  0.10135    
## road_percentage                0.093916   0.012130   7.742 4.01e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8939 on 618 degrees of freedom
## Multiple R-squared:  0.6348, Adjusted R-squared:  0.6313 
## F-statistic: 179.1 on 6 and 618 DF,  p-value: < 2.2e-16
# one-row model-fit summary for model2 (R-squared, AIC, BIC, log-likelihood)
glance(model2)
## # A tibble: 1 x 12
##   r.squared adj.r.squared sigma statistic   p.value    df logLik   AIC   BIC deviance df.residual
##       <dbl>         <dbl> <dbl>     <dbl>     <dbl> <dbl>  <dbl> <dbl> <dbl>    <dbl>       <int>
## 1     0.635         0.631 0.894      179. 1.24e-131     6  -813. 1642. 1678.     494.         618
## # … with 1 more variable: nobs <int>

5.2 Assumptions Underpinning Linear Regression

5.2.1 Assumption 1 - There is a linear relationship between the dependent and independent variables

# grid of distribution plots for the modelled variables
# (lp1, lp2, p4, lp5, lp6, lp7, p9, p10 are ggplot objects built earlier
# in the document — not visible in this chunk)
multiplot(lp1, lp2, p4, lp5, lp6, lp7, p9, p10, cols = 3)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.2.2 Assumption 2 - The residuals in your model should be normally distributed

# save fitted values and residuals alongside the original data
model1_data <- model1 %>%
  augment(., independent)

# plot the residuals as a single histogram; the original piped the vector
# into qplot() AND added geom_histogram(), which drew the same histogram
# layer twice (hence the duplicated `stat_bin()` messages)
model1_data %>%
  ggplot(aes(x = .resid)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# save fitted values and residuals alongside the data model2 was fitted on.
# The original augmented with `independent`, but model2 was estimated on
# `characters`; augment() recomputes .fitted/.resid against the frame it
# is given, so a mismatched frame would yield wrong residuals if the two
# data frames differ.
model2_data <- model2 %>%
  augment(., characters)

# single-histogram residual plot (avoids the duplicated qplot() +
# geom_histogram() layer of the original)
model2_data %>%
  ggplot(aes(x = .resid)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

5.2.3 Assumption 3 - No Multicolinearity in the independent variables

# columns 2:9 hold the two dependent and six independent variables
position <- c(2:9)

# pairwise Pearson correlations across all modelled variables;
# all_of() makes the external-vector selection explicit and silences the
# tidyselect ambiguity note the original printed
Correlation_all <- independent %>%
  dplyr::select(all_of(position)) %>%
  correlate()
## Note: Using an external vector in selections is ambiguous.
## ℹ Use `all_of(position)` instead of `position` to silence this message.
## ℹ See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
# print the full correlation matrix
Correlation_all
## # A tibble: 8 x 9
##   term  density userate employment_rate houseprice publictransacce… parking_density residential_per…
##   <chr>   <dbl>   <dbl>           <dbl>      <dbl>            <dbl>           <dbl>            <dbl>
## 1 dens… NA       -0.478          0.0314     0.549             0.569          0.279           -0.0500
## 2 user… -0.478   NA              0.109     -0.339            -0.557         -0.207            0.212 
## 3 empl…  0.0314   0.109         NA          0.153            -0.186         -0.0851           0.267 
## 4 hous…  0.549   -0.339          0.153     NA                 0.463          0.0805          -0.100 
## 5 publ…  0.569   -0.557         -0.186      0.463            NA              0.303           -0.175 
## 6 park…  0.279   -0.207         -0.0851     0.0805            0.303         NA               -0.0907
## 7 resi… -0.0500   0.212          0.267     -0.100            -0.175         -0.0907          NA     
## 8 road…  0.579   -0.464         -0.224      0.342             0.761          0.298            0.215 
## # … with 1 more variable: road_percentage <dbl>
# restrict to the six independent variables only
position <- c(4:9)

# correlation matrix with short labels for plotting; all_of() makes the
# external-vector selection explicit (tidyselect best practice)
Correlation <- independent %>%
  dplyr::select(all_of(position)) %>%
  dplyr::rename(ER = "employment_rate",
                HP = "houseprice",
                PS = "publictransaccess_score",
                PD = "parking_density",
                RSP = "residential_percentage",
                RP = "road_percentage") %>%
  correlate()
## 
## Correlation method: 'pearson'
## Missing treated using: 'pairwise.complete.obs'
# heat-map style plot of the correlation matrix (blue = negative,
# red = positive)
r <- rplot(Correlation, shape = 15,colours = c("skyblue1", "white", "indianred2"))
r
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.

# write the correlation plot to disk; print() explicitly renders the
# ggplot object inside the png device (a bare `r` only auto-prints at an
# interactive top level or via knitr's evaluator)
png(filename = "pic/correlation.png",
    width = 500, height = 500)
print(r)
## Don't know how to automatically pick scale for object of type noquote. Defaulting to continuous.
dev.off()
## quartz_off_screen 
##                 2

5.2.4 Assumption 4 - Homoscedasticity

# 2x2 diagnostic panel for model1: residuals vs fitted, normal Q-Q,
# scale-location and residuals vs leverage
par(mfrow=c(2,2)) 
plot(model1)

# the same homoscedasticity diagnostics for model2
par(mfrow=c(2,2)) 
plot(model2)

5.2.5 Assumption 5 - Independence of Errors

# run durbin-watson test m1
# a statistic near 2 indicates no first-order autocorrelation in residuals
DW1 <- durbinWatsonTest(model1)
tidy(DW1)
## # A tibble: 1 x 5
##   statistic p.value autocorrelation method             alternative
##       <dbl>   <dbl>           <dbl> <chr>              <chr>      
## 1      1.66       0           0.170 Durbin-Watson Test two.sided
# run durbin-watson test m2
DW2 <- durbinWatsonTest(model2)
tidy(DW2)
## # A tibble: 1 x 5
##   statistic p.value autocorrelation method             alternative
##       <dbl>   <dbl>           <dbl> <chr>              <chr>      
## 1      1.58       0           0.207 Durbin-Watson Test two.sided
# attach both models' residuals to the ward geometries for mapping
# NOTE(review): the object name `residuals` masks the data object but not
# the stats::residuals() function — calls to residuals(model1) still
# resolve to the function; a less ambiguous name would be clearer
residuals <- londonwards %>% 
  mutate(model1residual = residuals(model1)) %>% 
  mutate(model2residual = residuals(model2))
# plot the residuals m1 on an interactive map
tmap_mode("view")
## tmap mode set to interactive viewing
qtm(residuals, fill = "model1residual")
## Variable(s) "model1residual" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.
# plot the residuals m2
tmap_mode("view")
## tmap mode set to interactive viewing
qtm(residuals, fill = "model2residual")
## Variable(s) "model2residual" contains positive and negative values, so midpoint is set to 0. Set midpoint = NA to show the full spectrum of the color palette.

5.2.6 Moran’s I test for residuals (generally)

# ward centroids, used as the points for the k-nearest-neighbour graph
coordsW <- residuals %>% 
    st_centroid() %>%
    st_geometry()
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
# queen contiguity: wards sharing any boundary point are neighbours
# NOTE(review): queen=T uses the reassignable shorthand T; TRUE is safer
queen_nb <- residuals %>%
  poly2nb(., queen=T)

# k nearest neighbours (k = 9)
knn_nb <-coordsW %>%
  knearneigh(., k=9)%>%
  knn2nb()
# create spatial weights matrices; style "C" is globally standardised
# weighting

queens_weight <- queen_nb %>%
  nb2listw(., style="C")

knn_weight <- knn_nb %>%
  nb2listw(., style="C")
# Moran's I for model1 residuals under queen-contiguity weights; `$`
# extraction on the sf object yields the same plain vector as the original
# drop-geometry / select / pull chain
Queen1 <- moran.test(residuals$model1residual, queens_weight) %>%
  tidy()
Queen1
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic p.value method                           alternative
##       <dbl>     <dbl>     <dbl>     <dbl>   <dbl> <chr>                            <chr>      
## 1    0.0477  -0.00160  0.000538      2.13  0.0168 Moran I test under randomisation greater
# Moran's I for model1 residuals under the k = 9 nearest-neighbour weights
Knn1 <- moran.test(residuals$model1residual, knn_weight) %>%
  tidy()
Knn1
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic  p.value method                           alternative
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl> <chr>                            <chr>      
## 1    0.0647  -0.00160  0.000324      3.69 0.000113 Moran I test under randomisation greater
# Moran's I for model2 residuals under queen-contiguity weights
Queen2 <- moran.test(residuals$model2residual, queens_weight) %>%
  tidy()
Queen2
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic  p.value method                           alternative
##       <dbl>     <dbl>     <dbl>     <dbl>    <dbl> <chr>                            <chr>      
## 1    0.0726  -0.00160  0.000538      3.20 0.000685 Moran I test under randomisation greater
# Moran's I for model2 residuals under the k = 9 nearest-neighbour weights
Knn2 <- moran.test(residuals$model2residual, knn_weight) %>%
  tidy()
Knn2
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic     p.value method                           alternative
##       <dbl>     <dbl>     <dbl>     <dbl>       <dbl> <chr>                            <chr>      
## 1    0.0870  -0.00160  0.000323      4.93 0.000000415 Moran I test under randomisation greater

5.3 Spatial Regression Models

5.3.1 Spatial Lag models for Model 1 (queen)

# Spatial lag model 1: adds a spatially lagged copy of log(userate) as an
# extra regressor. queens_weight is the nb2listw(queen_nb, style = "C")
# object built earlier, so passing it here is identical to the inline call.
lag_model1_queen <- lagsarlm(
  log(userate) ~ employment_rate + log(houseprice) +
    log(publictransaccess_score) + log(parking_density) +
    residential_percentage + road_percentage,
  data = independent,
  listw = queens_weight,
  method = "eigen"
)

# coefficient table; rho is the spatial autoregressive parameter
tidy(lag_model1_queen)
## # A tibble: 8 x 5
##   term                         estimate std.error statistic  p.value
##   <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
## 1 rho                            0.0313   0.0188       1.67 9.54e- 2
## 2 (Intercept)                   20.8      1.17        17.7  0.      
## 3 employment_rate               -0.0123   0.00678     -1.82 6.93e- 2
## 4 log(houseprice)               -0.905    0.105       -8.63 0.      
## 5 log(publictransaccess_score)  -1.08     0.179       -6.05 1.43e- 9
## 6 log(parking_density)          -0.0609   0.0357      -1.70 8.83e- 2
## 7 residential_percentage         0.0252   0.00308      8.20 2.22e-16
## 8 road_percentage               -0.0560   0.0118      -4.76 1.93e- 6
# overall fit statistics
glance(lag_model1_queen)
## # A tibble: 1 x 6
##   r.squared   AIC   BIC deviance logLik  nobs
##       <dbl> <dbl> <dbl>    <dbl>  <dbl> <int>
## 1     0.578 1613. 1653.     470.  -798.   625

5.3.2 Spatial Lag models for Model 2 (queen)

# Spatial lag model 2: spatially lagged log(density) as an extra regressor
# under the queen-contiguity weights.
# NOTE(review): this uses data = independent while OLS model2 was fitted on
# `characters` — confirm the frames are equivalent
lag_model2_queen <- lagsarlm(
  log(density) ~ employment_rate + log(houseprice) +
    log(publictransaccess_score) + log(parking_density) +
    residential_percentage + road_percentage,
  data = independent,
  listw = queens_weight,
  method = "eigen"
)

# coefficient table; rho is the spatial autoregressive parameter
tidy(lag_model2_queen)
## # A tibble: 8 x 5
##   term                          estimate std.error statistic  p.value
##   <chr>                            <dbl>     <dbl>     <dbl>    <dbl>
## 1 rho                            0.0725    0.0323       2.24 2.48e- 2
## 2 (Intercept)                  -13.6       1.20       -11.4  0.      
## 3 employment_rate                0.0141    0.00690      2.05 4.08e- 2
## 4 log(houseprice)                0.955     0.109        8.73 0.      
## 5 log(publictransaccess_score)   1.07      0.183        5.82 6.05e- 9
## 6 log(parking_density)           0.104     0.0363       2.85 4.35e- 3
## 7 residential_percentage        -0.00437   0.00315     -1.39 1.65e- 1
## 8 road_percentage                0.0902    0.0121       7.47 8.06e-14
# overall fit statistics
glance(lag_model2_queen)
## # A tibble: 1 x 6
##   r.squared   AIC   BIC deviance logLik  nobs
##       <dbl> <dbl> <dbl>    <dbl>  <dbl> <int>
## 1     0.638 1639. 1679.     489.  -811.   625

5.3.3 Spatial Error models for Model 1 (queen)

# Spatial error model 1: models spatial autocorrelation in the error term
# (lambda) instead of lagging the response, under queen-contiguity weights
error_model1_queen <- errorsarlm(
  log(userate) ~ employment_rate + log(houseprice) +
    log(publictransaccess_score) + log(parking_density) +
    residential_percentage + road_percentage,
  data = independent,
  listw = queens_weight,
  method = "eigen"
)

# coefficient table; lambda is the spatial error parameter
tidy(error_model1_queen)
## # A tibble: 8 x 5
##   term                         estimate std.error statistic  p.value
##   <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                   21.3      1.17        18.2  0.      
## 2 employment_rate               -0.0104   0.00685     -1.51 1.30e- 1
## 3 log(houseprice)               -0.938    0.106       -8.81 0.      
## 4 log(publictransaccess_score)  -1.08     0.180       -6.03 1.65e- 9
## 5 log(parking_density)          -0.0728   0.0359      -2.03 4.24e- 2
## 6 residential_percentage         0.0244   0.00309      7.90 2.66e-15
## 7 road_percentage               -0.0540   0.0118      -4.60 4.32e- 6
## 8 lambda                         0.127    0.0651       1.95 5.14e- 2
# overall fit statistics
glance(error_model1_queen)
## # A tibble: 1 x 6
##   r.squared   AIC   BIC deviance logLik  nobs
##       <dbl> <dbl> <dbl>    <dbl>  <dbl> <int>
## 1     0.580 1612. 1652.     468.  -797.   625

5.3.4 Spatial Error models for Model 2 (queen)

# Spatial error model 2 for log(density) under queen-contiguity weights
error_model2_queen <- errorsarlm(
  log(density) ~ employment_rate + log(houseprice) +
    log(publictransaccess_score) + log(parking_density) +
    residential_percentage + road_percentage,
  data = independent,
  listw = queens_weight,
  method = "eigen"
)

# coefficient table; lambda is the spatial error parameter
tidy(error_model2_queen)
## # A tibble: 8 x 5
##   term                          estimate std.error statistic  p.value
##   <chr>                            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                  -14.2       1.21       -11.7  0.      
## 2 employment_rate                0.0123    0.00701      1.75 7.95e- 2
## 3 log(houseprice)                1.02      0.109        9.34 0.      
## 4 log(publictransaccess_score)   1.09      0.183        5.93 2.95e- 9
## 5 log(parking_density)           0.109     0.0366       2.98 2.85e- 3
## 6 residential_percentage        -0.00358   0.00315     -1.14 2.56e- 1
## 7 road_percentage                0.0895    0.0120       7.49 6.99e-14
## 8 lambda                         0.185     0.0632       2.94 3.33e- 3
# overall fit statistics
glance(error_model2_queen)
## # A tibble: 1 x 6
##   r.squared   AIC   BIC deviance logLik  nobs
##       <dbl> <dbl> <dbl>    <dbl>  <dbl> <int>
## 1     0.642 1636. 1676.     484.  -809.   625

5.3.5 Find best k for KNN strategy

Use a for loop to find the best k (1 to 10) for each of the spatial regression models, then summarise the best k values with their corresponding R-squared and AIC, so the models can be compared and the better specification identified for further analysis.

5.3.5.1 The k for Spatial Lag Model 1

r_squared <- list()
best_k <- list()
AIC <- list()
# vectors collecting R-squared (k) and AIC (a) for each candidate k
k <- NA
a <- NA

# ward centroids do not change between iterations, so compute them once
# outside the loop instead of once per k
coordsW <- residuals %>%
  st_centroid() %>%
  st_geometry()

# fit the spatial lag model for k = 1 to 10 nearest neighbours
for (i in 1:10) {

  # keep track of the runs
  print(paste0('starting run: k = ', i))

  knn_nb <- coordsW %>%
    knearneigh(., k = i) %>%
    knn2nb()

  lag_model1_knni <- lagsarlm(log(userate) ~
                              employment_rate+
                              log(houseprice)+
                              log(publictransaccess_score)+
                              log(parking_density)+
                              residential_percentage+
                              road_percentage,
                            data = independent, 
                            nb2listw(knn_nb, style="C"), 
                            method = "eigen")

  # record R-squared and AIC for this k
  k[i] <- glance(lag_model1_knni)[[1]]
  a[i] <- glance(lag_model1_knni)[[2]]
}

# choose the k with the highest R-squared. which.max() also considers
# k = 1, which the original running-maximum comparison
# `k[i] > max(k[1:(i-1)])` could never select (at i = 1 it compared k[1]
# against itself, so the condition was always FALSE)
best_k <- which.max(k)
r_squared <- k[best_k]
AIC <- a[best_k]
## [1] "starting run: k = 1"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 2"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 3"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 4"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 5"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 6"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 7"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 8"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 9"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 10"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
# best R-squared across k = 1..10 for spatial lag model 1
r_squared 
## [1] 0.5846022
# the k that achieved it
best_k 
## [1] 7
# and the corresponding AIC
AIC
## [1] 1605.435

5.3.5.2 The k for Spatial Lag Model 2

r_squared2 <- list()
best_k2 <- list()
AIC2 <- list()
# vectors collecting R-squared (k) and AIC (a) for each candidate k
k <- NA
a <- NA

# ward centroids are loop-invariant: compute once, not once per k
coordsW <- residuals %>%
  st_centroid() %>%
  st_geometry()

# fit the spatial lag model for k = 1 to 10 nearest neighbours
for (i in 1:10) {

  # keep track of the runs
  print(paste0('starting run: k = ', i))

  knn_nb <- coordsW %>%
    knearneigh(., k = i) %>%
    knn2nb()

  lag_model2_knni <- lagsarlm(log(density) ~
                              employment_rate+
                              log(houseprice)+
                              log(publictransaccess_score)+
                              log(parking_density)+
                              residential_percentage+
                              road_percentage,
                            data = independent, 
                            nb2listw(knn_nb, style="C"), 
                            method = "eigen")

  # record R-squared and AIC for this k
  k[i] <- glance(lag_model2_knni)[[1]]
  a[i] <- glance(lag_model2_knni)[[2]]
}

# select the best k after the loop; which.max() fixes the original
# running-maximum test, which could never select k = 1
best_k2 <- which.max(k)
r_squared2 <- k[best_k2]
AIC2 <- a[best_k2]
## [1] "starting run: k = 1"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 2"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 3"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 4"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 5"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 6"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 7"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 8"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 9"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 10"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
# best R-squared across k = 1..10 for spatial lag model 2
r_squared2
## [1] 0.6466878
# the k that achieved it
best_k2 
## [1] 7
# and the corresponding AIC
AIC2
## [1] 1626.565

5.3.5.3 The k for Spatial Error Model 1

r_squared3 <- list()
best_k3 <- list()
AIC3 <- list()
# vectors collecting R-squared (k) and AIC (a) for each candidate k
k <- NA
a <- NA

# ward centroids are loop-invariant: compute once, not once per k
coordsW <- residuals %>%
  st_centroid() %>%
  st_geometry()

# fit the spatial error model for k = 1 to 10 nearest neighbours
for (i in 1:10) {

  # keep track of the runs
  print(paste0('starting run: k = ', i))

  knn_nb <- coordsW %>%
    knearneigh(., k = i) %>%
    knn2nb()

  error_model1_knni <- errorsarlm(log(userate) ~
                                  employment_rate+
                                  log(houseprice)+
                                  log(publictransaccess_score)+
                                  log(parking_density)+
                                  residential_percentage+
                                  road_percentage,
                                data = independent, 
                                nb2listw(knn_nb, style="C"), 
                                method = "eigen")

  # record R-squared and AIC for this k
  k[i] <- glance(error_model1_knni)[[1]]
  a[i] <- glance(error_model1_knni)[[2]]
}

# select the best k after the loop; which.max() fixes the original
# running-maximum test, which could never select k = 1
best_k3 <- which.max(k)
r_squared3 <- k[best_k3]
AIC3 <- a[best_k3]
## [1] "starting run: k = 1"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 2"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 3"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 4"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 5"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 6"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 7"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 8"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 9"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 10"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
# best R-squared across k = 1..10 for spatial error model 1
r_squared3
## [1] 0.5860321
# the k that achieved it
best_k3 
## [1] 9
# and the corresponding AIC
AIC3
## [1] 1605.57

5.3.5.4 The k for Spatial Error Model 2

r_squared4 <- list()
best_k4 <- list()
AIC4 <- list()
# vectors collecting R-squared (k) and AIC (a) for each candidate k
k <- NA
a <- NA

# ward centroids are loop-invariant: compute once, not once per k
coordsW <- residuals %>%
  st_centroid() %>%
  st_geometry()

# fit the spatial error model for k = 1 to 10 nearest neighbours
for (i in 1:10) {

  # keep track of the runs
  print(paste0('starting run: k = ', i))

  knn_nb <- coordsW %>%
    knearneigh(., k = i) %>%
    knn2nb()

  # knn_weight is reused by the Moran's I checks after section 5.3.6,
  # so keep rebuilding it here as the original did
  knn_weight <- knn_nb %>%
    nb2listw(., style = "C")

  error_model2_knni <- errorsarlm(log(density) ~
                                  employment_rate+
                                  log(houseprice)+
                                  log(publictransaccess_score)+
                                  log(parking_density)+
                                  residential_percentage+
                                  road_percentage,
                                data = independent, 
                                nb2listw(knn_nb, style="C"), 
                                method = "eigen")

  # record R-squared and AIC for this k
  k[i] <- glance(error_model2_knni)[[1]]
  a[i] <- glance(error_model2_knni)[[2]]
}

# select the best k after the loop; which.max() fixes the original
# running-maximum test, which could never select k = 1
best_k4 <- which.max(k)
r_squared4 <- k[best_k4]
AIC4 <- a[best_k4]
## [1] "starting run: k = 1"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 2"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 3"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 4"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 5"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 6"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 7"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 8"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 9"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
## [1] "starting run: k = 10"
## Warning in st_centroid.sf(.): st_centroid assumes attributes are constant over geometries of x
# best R-squared across k = 1..10 for spatial error model 2
r_squared4 
## [1] 0.6491493
# the k that achieved it
best_k4 
## [1] 9
# and the corresponding AIC
AIC4
## [1] 1626.477

5.3.6 Spatial Error models for Model 1 (KNN; k=9)

# Rebuild the k = 9 neighbour list explicitly: after the search loops
# above, knn_nb is left at k = 10 (the last loop iteration), so fitting
# without this step would silently use the wrong k despite the section
# title claiming k = 9
knn_nb <- coordsW %>%
  knearneigh(., k = 9) %>%
  knn2nb()
knn_weight <- knn_nb %>%
  nb2listw(., style = "C")

# Spatial error model 1 with the best-performing KNN weights (k = 9)
error_model1_knn9 <- errorsarlm(log(userate) ~
                                  employment_rate+
                                  log(houseprice)+
                                  log(publictransaccess_score)+
                                  log(parking_density)+
                                  residential_percentage+
                                  road_percentage,
                                data = independent, 
                                nb2listw(knn_nb, style="C"), 
                                method = "eigen")

# coefficient table; lambda is the spatial error parameter
tidy(error_model1_knn9)
## # A tibble: 8 x 5
##   term                         estimate std.error statistic  p.value
##   <chr>                           <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                  21.3       1.20        17.8  0.      
## 2 employment_rate              -0.00991   0.00686     -1.44 1.49e- 1
## 3 log(houseprice)              -0.940     0.108       -8.67 0.      
## 4 log(publictransaccess_score) -1.10      0.179       -6.13 8.71e-10
## 5 log(parking_density)         -0.0774    0.0358      -2.16 3.06e- 2
## 6 residential_percentage        0.0238    0.00309      7.70 1.38e-14
## 7 road_percentage              -0.0523    0.0117      -4.46 8.16e- 6
## 8 lambda                        0.241     0.0805       2.99 2.77e- 3
# overall fit statistics
glance(error_model1_knn9)
## # A tibble: 1 x 6
##   r.squared   AIC   BIC deviance logLik  nobs
##       <dbl> <dbl> <dbl>    <dbl>  <dbl> <int>
## 1     0.585 1607. 1647.     463.  -794.   625
# Ensure the neighbour list really is k = 9 before fitting (the search
# loops leave knn_nb at k = 10; rebuilding here is idempotent if the fix
# above has already run)
knn_nb <- coordsW %>%
  knearneigh(., k = 9) %>%
  knn2nb()
knn_weight <- knn_nb %>%
  nb2listw(., style = "C")

# Spatial error model 2 with the best-performing KNN weights (k = 9)
error_model2_knn9 <- errorsarlm(log(density) ~
                                  employment_rate+
                                  log(houseprice)+
                                  log(publictransaccess_score)+
                                  log(parking_density)+
                                  residential_percentage+
                                  road_percentage,
                                data = independent, 
                                nb2listw(knn_nb, style="C"), 
                                method = "eigen")

# coefficient table; lambda is the spatial error parameter
tidy(error_model2_knn9)
## # A tibble: 8 x 5
##   term                          estimate std.error statistic  p.value
##   <chr>                            <dbl>     <dbl>     <dbl>    <dbl>
## 1 (Intercept)                  -14.0       1.24      -11.4   0.      
## 2 employment_rate                0.0121    0.00701     1.72  8.52e- 2
## 3 log(houseprice)                1.01      0.111       9.05  0.      
## 4 log(publictransaccess_score)   1.11      0.183       6.07  1.30e- 9
## 5 log(parking_density)           0.112     0.0364      3.07  2.12e- 3
## 6 residential_percentage        -0.00298   0.00315    -0.948 3.43e- 1
## 7 road_percentage                0.0880    0.0119      7.38  1.55e-13
## 8 lambda                         0.307     0.0766      4.01  6.18e- 5
# overall fit statistics
glance(error_model2_knn9)
## # A tibble: 1 x 6
##   r.squared   AIC   BIC deviance logLik  nobs
##       <dbl> <dbl> <dbl>    <dbl>  <dbl> <int>
## 1     0.647 1629. 1669.     477.  -805.   625

5.3.6.1 Check residuals

Check whether the residuals of the better model still show spatial autocorrelation.

# residuals for spatial error model 1

# attach the spatial-error-model (k = 9) residuals to the spatial frame
residuals <- residuals %>%
  mutate(error_model1_knn9_resids = residuals(error_model1_knn9))

# Moran's I of those residuals under the current KNN weights; `$`
# extraction yields the same vector as the original drop/select/pull chain
ErrorMoran1 <- moran.test(residuals$error_model1_knn9_resids, knn_weight) %>%
  tidy()

ErrorMoran1
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic p.value method                           alternative
##       <dbl>     <dbl>     <dbl>     <dbl>   <dbl> <chr>                            <chr>      
## 1  -0.00487  -0.00160  0.000291    -0.192   0.576 Moran I test under randomisation greater
# residuals for spatial error model 2

# attach the spatial-error-model 2 (k = 9) residuals to the spatial frame
residuals <- residuals %>%
  mutate(error_model2_knn9_resids = residuals(error_model2_knn9))

# Moran's I of those residuals under the current KNN weights
ErrorMoran2 <- moran.test(residuals$error_model2_knn9_resids, knn_weight) %>%
  tidy()

ErrorMoran2
## # A tibble: 1 x 7
##   estimate1 estimate2 estimate3 statistic p.value method                           alternative
##       <dbl>     <dbl>     <dbl>     <dbl>   <dbl> <chr>                            <chr>      
## 1  -0.00716  -0.00160  0.000291    -0.326   0.628 Moran I test under randomisation greater